• Show All Code
  • Hide All Code

violation analysis

Restaurant group

2024-12-02

Analysis related to violation of Manhattan Restaurants

library(tidyverse)
library(plotly)
library(ggplot2)
library(knitr)
library(readr)
library(dplyr)
library(tidyr)
library(scales)

• N = Not Yet Graded• A = Grade A• B = Grade B• C = Grade C• Z = Grade Pending• P= Grade Pending issued on re-opening following an initial inspection that resulted in a closure

Indicator of critical violation; “• Critical • Not Critical • Not Applicable”; Critical violations are those most likely to contribute to food-borne illness

# The same data cleaning process in demographics part, also only consider data with valid violation_description and critical_flag, and grade A, B, C.

manhattan_data = read_csv("Manhattan_Restaurant_Inspection_Results.csv", na = c("NA", "", "."))
str (manhattan_data)
## spc_tbl_ [94,616 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ CAMIS                : num [1:94616] 50140436 50158081 50152703 50160975 50161087 ...
##  $ DBA                  : chr [1:94616] "JUST SALAD" "THE MANNER" "MIDNIGHT BLUE" "BLUE BLOSSOM" ...
##  $ BORO                 : chr [1:94616] "Manhattan" "Manhattan" "Manhattan" "Manhattan" ...
##  $ BUILDING             : chr [1:94616] "2853" "58" "106" "108" ...
##  $ STREET               : chr [1:94616] "BROADWAY" "THOMPSON STREET" "EAST   19 STREET" "WEST   39 STREET" ...
##  $ ZIPCODE              : num [1:94616] 10025 10012 10003 10018 10025 ...
##  $ PHONE                : num [1:94616] 7.32e+09 9.17e+09 3.48e+09 6.47e+09 9.29e+09 ...
##  $ CUISINE DESCRIPTION  : chr [1:94616] NA NA NA NA ...
##  $ INSPECTION DATE      : chr [1:94616] "01/01/1900" "01/01/1900" "01/01/1900" "01/01/1900" ...
##  $ ACTION               : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION CODE       : chr [1:94616] NA NA NA NA ...
##  $ VIOLATION DESCRIPTION: chr [1:94616] NA NA NA NA ...
##  $ CRITICAL FLAG        : chr [1:94616] "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ SCORE                : num [1:94616] NA NA NA NA NA NA NA NA NA NA ...
##  $ GRADE                : chr [1:94616] NA NA NA NA ...
##  $ GRADE DATE           : chr [1:94616] NA NA NA NA ...
##  $ RECORD DATE          : chr [1:94616] "11/05/2024" "11/05/2024" "11/05/2024" "11/05/2024" ...
##  $ INSPECTION TYPE      : chr [1:94616] NA NA NA NA ...
##  $ Latitude             : num [1:94616] 40.8 40.7 40.7 40.8 40.8 ...
##  $ Longitude            : num [1:94616] -74 -74 -74 -74 -74 ...
##  $ Community Board      : num [1:94616] 109 102 105 105 107 108 101 105 104 102 ...
##  $ Council District     : chr [1:94616] "07" "01" "02" "04" ...
##  $ Census Tract         : chr [1:94616] "019900" "004700" "005000" "011300" ...
##  $ BIN                  : num [1:94616] 1075440 1087362 1017905 1015273 1055676 ...
##  $ BBL                  : num [1:94616] 1.02e+09 1.00e+09 1.01e+09 1.01e+09 1.02e+09 ...
##  $ NTA                  : chr [1:94616] "MN09" "MN24" "MN21" "MN17" ...
##  $ Location Point1      : logi [1:94616] NA NA NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   CAMIS = col_double(),
##   ..   DBA = col_character(),
##   ..   BORO = col_character(),
##   ..   BUILDING = col_character(),
##   ..   STREET = col_character(),
##   ..   ZIPCODE = col_double(),
##   ..   PHONE = col_double(),
##   ..   `CUISINE DESCRIPTION` = col_character(),
##   ..   `INSPECTION DATE` = col_character(),
##   ..   ACTION = col_character(),
##   ..   `VIOLATION CODE` = col_character(),
##   ..   `VIOLATION DESCRIPTION` = col_character(),
##   ..   `CRITICAL FLAG` = col_character(),
##   ..   SCORE = col_double(),
##   ..   GRADE = col_character(),
##   ..   `GRADE DATE` = col_character(),
##   ..   `RECORD DATE` = col_character(),
##   ..   `INSPECTION TYPE` = col_character(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double(),
##   ..   `Community Board` = col_double(),
##   ..   `Council District` = col_character(),
##   ..   `Census Tract` = col_character(),
##   ..   BIN = col_double(),
##   ..   BBL = col_double(),
##   ..   NTA = col_character(),
##   ..   `Location Point1` = col_logical()
##   .. )
##  - attr(*, "problems")=<externalptr>

cleaned_data = manhattan_data %>%
  janitor::clean_names() %>%  
  filter(
    !is.na(dba),                         
    !is.na(cuisine_description),       
    !is.na(grade),                       
    !is.na(score),                       
    !is.na(zipcode),
    !is.na(violation_description),
    !is.na(critical_flag),
    grade %in% c("A", "B", "C")
  ) %>% mutate(region = case_when(
    zipcode >= 10000 & zipcode <= 10025 ~ "Downtown",
    zipcode >= 10026 & zipcode <= 10040 ~ "Midtown",
    zipcode >= 10041 & zipcode <= 10282 ~ "Uptown",
    TRUE ~ "Other" # For ZIP codes like 11371, 12345, etc.
  ))
  1. Effect of violations on Grade (A,B,C…) Effect of Critical Violations on Grades Analysis: Examine how critical violations affect the overall grade a restaurant receives. This can help show if critical violations are strongly correlated with lower grades. Visualization: Grouped bar chart showing average scores and grades for restaurants with and without critical violations (based on CRITICAL FLAG).
# Analyze the effect of violation_description on grade
# Group by critical flag and grade to calculate counts
violation_grade_summary <- cleaned_data %>%
  group_by(critical_flag, grade) %>%
  summarize(count = n(), .groups = "drop")

# Calculate average scores by CRITICAL_FLAG and grade
average_score <- cleaned_data %>%
  group_by(critical_flag, grade) %>%
  summarize(avg_score = mean(score), .groups = "drop")


# Create an EDA using plotly
violation_grade = violation_grade_summary %>% 
  plot_ly(
    x = ~grade, 
    y = ~count,
    type = 'bar',
    color = ~critical_flag,
    colors = "viridis") %>% 
  layout(
    title = "Effect of Violation Type on Restaurant Grade",
    xaxis = list(title = "Grade"),   
    yaxis = list(title = "Restaurant Number") 
  )

violation_score = average_score %>% 
  plot_ly(
    x = ~grade, 
    y = ~avg_score,
    type = 'bar',
    color = ~critical_flag,
    colors = "viridis") %>% 
  layout(
    title = "Effect of Violation Type on Restaurant Score",
    xaxis = list(title = "Grade"),   
    yaxis = list(title = "Restaurant Average Score") 
  )

Chi-Square Test between critical_flag and grade: conclusion: p-value<0.05, reject null, they are significant related.

contingency_table <- table(cleaned_data$critical_flag, cleaned_data$grade)

chi_test <- chisq.test(contingency_table)

print(chi_test)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 821.92, df = 2, p-value < 2.2e-16
  1. Restaurant Type vs. Violation Frequency (Spearman): Explore whether certain restaurant types are associated with a higher frequency of violations. find which restaurant type has largest precent of critical violation.
# Group by cuisine_description and calculate total and critical counts
violation_cuisine <- cleaned_data %>%
  mutate(critical_flag = ifelse(critical_flag == "Critical", 1, 0)) %>%
  group_by(cuisine_description) %>%
  summarize(total_violations = n(),
            critical_violations = sum(critical_flag, na.rm = TRUE),
            critical_percent = (critical_violations / total_violations),
            critical_percentage = percent(critical_percent, accuracy = 0.01)
            ) %>%
  arrange(desc(critical_percentage))  %>% # Sort by percentage of critical violations 
  head(10) # View top 10 restaurant types with the highest percentage of critical violations

plot_ly(
  type = 'table',
  header = list(
    values = c("Cuisine Description", "Critical Percentage"),
    align = c("center", "center"),
    font = list(size = 14, color = "white"),
    fill = list(color = "purple")
  ),
  cells = list(
    values = rbind(violation_cuisine$cuisine_description, violation_cuisine$critical_percentage),
    align = c("center", "center"),
    font = list(size = 12),
    fill = list(color = c("white", "white"))
  )
)

what type of violations are most comment in Chinese/Japanese cuisine:

chinese_japanese <- cleaned_data %>%
  filter(cuisine_description == "Chinese/Japanese") %>% # 42 rows in total
  group_by(violation_description) %>%
  summarize(num_violation = n(),
            violation_percent = num_violation / 42,  
            violation_percentage = percent(violation_percent, accuracy = 0.01)) %>%
  arrange(desc(violation_percent))

what type of violations are most comment in critical violation: